1 Load necessary libraries

#load necessary libraries
options(warn=-1)
library(astsa) #provides time series model functions (arima, sarima)
library(TSstudio) #plotly library for plotting time series data
library(xts) #library to convert data frame to time series object
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked from 'package:astsa':
## 
##     gas

1.0.1 Load Quaterly earnings of J&J data and plot the time series data

ts_plot(JohnsonJohnson,title='Quaterly earnings of J&J share',
        Xtitle='Year',Ytitle='Share price')

2 Verifying trend and seasonality in the data

ts_decompose(JohnsonJohnson)

It can be clearly seen that there is both trend and seasonality in the data.

3 Remove Trend and seasonality

In order to remove the trend, we use the difference operator on the share price. Specifically, a new series is constructed where the value at the current time step is calculated as the difference between the original observation and the observation at the previous time step.

3.0.1 value(t) = observation(t) - observation(t-1)

#1. Log-return transformation 
ts_decompose(diff(log(JohnsonJohnson))) #d=1
acf(diff(log(JohnsonJohnson)),main="ACF") #there is seasonality after every 4 lags 

pacf(diff(log(JohnsonJohnson)),main='pacf')

It can be observed that the trend has been removed upto a great extent.

3.0.2 In order to remove the seasonality, we double differentiate the share price.

#2. Seasonal differencing 
ts_plot(diff(diff(log(JohnsonJohnson)),4))
ts_decompose(diff(diff(log(JohnsonJohnson))))

4 Ljung-Box test to check for correlations between lags

4.0.1 Null Hypothesis (H0): There is no autocorrelation between the lags

4.0.2 Alternate Hypothesis (H1): There is significant autocorrelation between the lags

#3. Ljung Box test
Box.test(diff(diff(JohnsonJohnson)),lag=log(length(JohnsonJohnson)))
## 
##  Box-Pierce test
## 
## data:  diff(diff(JohnsonJohnson))
## X-squared = 128.99, df = 4.4308, p-value < 2.2e-16
par(mfrow=c(2,1))
acf(diff(diff(log(JohnsonJohnson)),4))
pacf(diff(diff(log(JohnsonJohnson)),4))

p-value is less than 0.05 and hence, autocorrelation still exists between the lags and cannot be eliminated/reduced further.

A more appropriate method to check correlations between the lags is by using ts_lags()- produces a scatter plot for each lag

Autocorrelation between lags on raw data

ts_lags(JohnsonJohnson)

Extremely high correlations are found between each of the lags (linear plots/ relationships)

After double differentiating and taking log-transform, the autocorrelation is removed to a certain extent.

ts_lags(diff(diff(log(JohnsonJohnson))))

5 Select best model

d=1 #since we have differenced the Time Series once
D=1
s=4 #duration

for(p in 1:2)
{for(q in 1:2)
{for(P in 1:2)
{for(Q in 1:2)
{if(p+d+q+P+D+Q<=10)
{model<-arima(x=log(JohnsonJohnson),order=c(p-1,d,q-1),seasonal=list(order=c(P-1,D,Q-1),period=s))
test<-Box.test(model$residuals,lag=log(length(model$residuals)))
sse=sum(model$residuals^2)
cat(p-1,d,q-1,P-1,D,Q-1, "AIC:",model$aic,"SSE:",sse,'p-value:',test$p.value,'\n')

}
}
}
}
}
## 0 1 0 0 1 0 AIC: -124.0685 SSE: 0.9377872 p-value: 0.0002610792 
## 0 1 0 0 1 1 AIC: -126.3493 SSE: 0.8856995 p-value: 0.0001606501 
## 0 1 0 1 1 0 AIC: -125.9198 SSE: 0.8908546 p-value: 0.0001978113 
## 0 1 0 1 1 1 AIC: -124.3648 SSE: 0.8854555 p-value: 0.0001574029 
## 0 1 1 0 1 0 AIC: -145.5139 SSE: 0.6891989 p-value: 0.03543717 
## 0 1 1 0 1 1 AIC: -150.7528 SSE: 0.6265214 p-value: 0.6089542 
## 0 1 1 1 1 0 AIC: -150.9134 SSE: 0.6251635 p-value: 0.7079173 
## 0 1 1 1 1 1 AIC: -149.1317 SSE: 0.6232876 p-value: 0.6780876 
## 1 1 0 0 1 0 AIC: -139.8248 SSE: 0.7467495 p-value: 0.03503386 
## 1 1 0 0 1 1 AIC: -146.0191 SSE: 0.6692692 p-value: 0.5400206 
## 1 1 0 1 1 0 AIC: -146.0319 SSE: 0.6689661 p-value: 0.5612965 
## 1 1 0 1 1 1 AIC: -144.3766 SSE: 0.6658382 p-value: 0.5459446 
## 1 1 1 0 1 0 AIC: -145.8284 SSE: 0.667109 p-value: 0.2200492 
## 1 1 1 0 1 1 AIC: -148.7706 SSE: 0.6263678 p-value: 0.594822 
## 1 1 1 1 1 0 AIC: -148.9175 SSE: 0.6251104 p-value: 0.7195471 
## 1 1 1 1 1 1 AIC: -144.4483 SSE: 0.6097742 p-value: 0.3002703
#hence we chose the model (0,1,1,1,1,0)

The model with the lowest AIC is (0,1,1,1,1,0) and one of the highest p-value.

#5. plotting seasonal arima of the model
sarima(log(JohnsonJohnson),0,1,1,1,1,0,4)
## initial  value -2.237259 
## iter   2 value -2.429075
## iter   3 value -2.446737
## iter   4 value -2.455821
## iter   5 value -2.459761
## iter   6 value -2.462511
## iter   7 value -2.462602
## iter   8 value -2.462749
## iter   9 value -2.462749
## iter   9 value -2.462749
## iter   9 value -2.462749
## final  value -2.462749 
## converged
## initial  value -2.411490 
## iter   2 value -2.412022
## iter   3 value -2.412060
## iter   4 value -2.412062
## iter   4 value -2.412062
## iter   4 value -2.412062
## final  value -2.412062 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     include.mean = !no.constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc, 
##         REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ma1     sar1
##       -0.6796  -0.3220
## s.e.   0.0969   0.1124
## 
## sigma^2 estimated as 0.007913:  log likelihood = 78.46,  aic = -150.91
## 
## $degrees_of_freedom
## [1] 77
## 
## $ttable
##      Estimate     SE t.value p.value
## ma1   -0.6796 0.0969 -7.0104  0.0000
## sar1  -0.3220 0.1124 -2.8641  0.0054
## 
## $AIC
## [1] -1.910297
## 
## $AICc
## [1] -1.908298
## 
## $BIC
## [1] -1.820318
#acf plot shows that there is no autocorrelation between previous lags since no spike is above the noise line 
#p-value verified that there is no autocorrelation 
#Q-Q plot for residuals is almost linear
#6. forecasting 
model.fit<-arima(x=(JohnsonJohnson),order=c(0,1,1),seasonal=list(order=c(1,1,0),period=4))
print(model.fit)
## 
## Call:
## arima(x = (JohnsonJohnson), order = c(0, 1, 1), seasonal = list(order = c(1, 
##     1, 0), period = 4))
## 
## Coefficients:
##           ma1    sar1
##       -0.7835  0.1382
## s.e.   0.0674  0.1167
## 
## sigma^2 estimated as 0.1919:  log likelihood = -47.36,  aic = 100.71

The AIC of the above model is somewhat less. Plotting the results of the forecast. # Forecasting {.tabset}

plot_forecast(forecast(model.fit))
forecast(model.fit)
##         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1981 Q1       17.87703 17.31558 18.43847 17.01837 18.73568
## 1981 Q2       16.28482 15.71037 16.85926 15.40628 17.16336
## 1981 Q3       17.56016 16.97300 18.14733 16.66218 18.45815
## 1981 Q4       13.21237 12.61276 13.81198 12.29535 14.12940
## 1982 Q1       19.48728 18.51876 20.45581 18.00606 20.96851
## 1982 Q2       17.88647 16.88369 18.88926 16.35285 19.42010
## 1982 Q3       19.15150 18.11559 20.18741 17.56722 20.73579
## 1982 Q4       14.81231 13.74430 15.88032 13.17893 16.44569

We can see that the model has captured the trend and seasonality in the data and predicted on similar lines.